Intro and preliminaries

If you haven’t already, install these packages (you don’t have to do this every time), so if you need to, uncomment the line below and run it.

#install.packages(c("tidyverse","knitr", "Hmisc"))

Then you load them with the “library” command. Confusingly, when you load the tidyverse library, some of its sub-libraries automatically load, and others need to be separately loaded (e.g. broom).

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.5
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(broom)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# some useful settings (options) ------------------------------------
options(tibble.width = 300,
        dplyr.width = 300)
# these make datasets easier to see when they get displayed on screen
# later, you can mess with them and see what they do if you want.

Reminder: how to get help from R. Put a question mark in front of a function or built-in/loaded dataset, and help will appear!

?mean
?diamonds
#(mean is a function, diamonds is a dataset)

You can get in-line help by pressing tab as you go: R will autocomplete what you’re typing within the function it will give you hints about the arguments the function takes try it out by typing mean( in the console below and then hitting tab.

Data preprocessing

Reading in data

you probably already read in the data in the intro script, but if you’re just jumping in

ma_data <- read_csv("datasets/mental_abacus_data.csv")
## Parsed with column specification:
## cols(
##   subid = col_character(),
##   class_num = col_character(),
##   grade = col_character(),
##   group = col_character(),
##   year = col_integer(),
##   ravens = col_integer(),
##   gonogo = col_double(),
##   swm = col_double(),
##   pvAvg = col_double(),
##   arithmeticTotal = col_integer(),
##   arithmeticAverage = col_double(),
##   woodcockTotal = col_integer()
## )
ps_data <- read_csv("datasets/pragmatic_scales_data.csv")
## Parsed with column specification:
## cols(
##   subid = col_character(),
##   item = col_character(),
##   correct = col_integer(),
##   age = col_double(),
##   condition = col_character()
## )

remember, you can use summary, glimpse, and View to remind yourself what these data files look like (always good to be very careful with that!)

summary(ma_data)
##     subid            class_num            grade          
##  Length:328         Length:328         Length:328        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     group                year          ravens          gonogo      
##  Length:328         Min.   :2015   Min.   : 0.00   Min.   :0.2769  
##  Class :character   1st Qu.:2015   1st Qu.: 6.00   1st Qu.:0.7231  
##  Mode  :character   Median :2016   Median :10.00   Median :0.8154  
##                     Mean   :2016   Mean   :11.09   Mean   :0.7899  
##                     3rd Qu.:2016   3rd Qu.:15.00   3rd Qu.:0.8692  
##                     Max.   :2016   Max.   :25.00   Max.   :0.9923  
##                                                    NA's   :1       
##       swm            pvAvg        arithmeticTotal  arithmeticAverage
##  Min.   :1.125   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000   
##  1st Qu.:2.767   1st Qu.:0.0900   1st Qu.: 4.000   1st Qu.:0.0800   
##  Median :3.777   Median :0.3600   Median : 8.000   Median :0.1700   
##  Mean   :3.818   Mean   :0.3649   Mean   : 8.419   Mean   :0.1751   
##  3rd Qu.:4.753   3rd Qu.:0.6400   3rd Qu.:12.000   3rd Qu.:0.2500   
##  Max.   :8.714   Max.   :1.0000   Max.   :29.000   Max.   :0.6000   
##  NA's   :4       NA's   :1        NA's   :1        NA's   :1        
##  woodcockTotal 
##  Min.   : 0.0  
##  1st Qu.: 8.0  
##  Median :11.0  
##  Mean   :10.5  
##  3rd Qu.:13.0  
##  Max.   :20.0  
## 
# View(ps_data)
glimpse(ma_data)
## Observations: 328
## Variables: 12
## $ subid             <chr> "S1-02-03", "S1-02-03", "S1-02-08", "S1-02-08", "S1-02-17", "S1-02-17", "S1-03-05", "S1-03-05", "S1-03-14", "S1-03-14", "S1-03-15", "S1-03-15", "S1-04-01", "S1-04-01", "S1-04-03", "S1-04-03", "S1-04-04", "S1-04-04", "S1-04-07", "S1-04-07", "S1-04-08", "S1-04-08", "S1-0...
## $ class_num         <chr> "S1_02", "S1_02", "S1_02", "S1_02", "S1_02", "S1_02", "S1_03", "S1_03", "S1_03", "S1_03", "S1_03", "S1_03", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_04", "S1_07", "S1_07",...
## $ grade             <chr> "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade", "first grade",...
## $ group             <chr> "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "MA", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "CNTL", "...
## $ year              <int> 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015, 2016, 2015,...
## $ ravens            <int> 3, 15, 4, 12, 5, 10, 8, 18, 8, 8, 10, 14, 9, 21, 7, 4, 6, 6, 13, 7, 8, 15, 8, 25, 17, 21, 4, 15, 9, 12, 4, 3, 0, 10, 7, 22, 9, 10, 5, 2, 10, 14, 12, 15, 12, 9, 3, 6, 2, 4, 4, 4, 4, 11, 13, 4, 5, 8, 7, 4, 7, 9, 3, 5, 8, 18, 6, 11, 6, 7, 5, 9, 9, 12, 7, 3, 8, 2, 13, 3, 7...
## $ gonogo            <dbl> 0.8153846, 0.7076923, 0.7615385, 0.7076923, 0.4307692, 0.7307692, 0.6230769, 0.5615385, 0.7615385, 0.7846154, 0.3615385, 0.7307692, 0.9307692, 0.8615385, 0.7384615, 0.8307692, 0.7230769, 0.7923077, 0.7786260, 0.7307692, 0.8692308, 0.8538462, 0.7538462, 0.8384615, 0.538...
## $ swm               <dbl> 1.833333, 2.125000, 1.625000, 1.375000, 1.500000, 4.375000, 2.818182, 5.208333, 3.083333, 3.583333, 1.125000, 2.791667, 4.375000, 3.583333, 3.437500, 4.500000, 2.416667, 5.600000, NA, 3.625000, 2.750000, 4.312500, 1.416667, 3.255319, 4.117647, 5.083333, 3.409091, 5.777...
## $ pvAvg             <dbl> 0.00, 0.36, 0.00, 0.36, 0.09, NA, 0.00, 0.91, 0.00, 0.27, 0.00, 0.27, 0.00, 0.82, 0.00, 0.36, 0.00, 0.18, 0.00, 0.27, 0.00, 0.36, 0.00, 0.09, 0.09, 0.45, 0.00, 0.18, 0.00, 1.00, 0.00, 0.18, 0.00, 0.36, 0.27, 0.36, 0.18, 0.18, 0.00, 0.36, 0.09, 0.09, 0.00, 0.09, 0.00, 0...
## $ arithmeticTotal   <int> 0, 12, 0, 4, 8, NA, 4, 16, 3, 8, 0, 4, 0, 13, 3, 10, 3, 6, 1, 4, 3, 6, 2, 5, 4, 8, 2, 12, 3, 15, 2, 7, 1, 5, 4, 8, 3, 4, 2, 5, 3, 9, 3, 8, 3, 9, 0, 6, 1, 9, 0, 13, 2, 12, 3, 13, 3, 10, 3, 10, 3, 12, 3, 9, 4, 13, 2, 3, 3, 5, 3, 6, 1, 6, 0, 5, 1, 4, 1, 14, 0, 4, 2, 5, 1,...
## $ arithmeticAverage <dbl> 0.00, 0.25, 0.00, 0.08, 0.17, NA, 0.08, 0.33, 0.06, 0.17, 0.00, 0.08, 0.00, 0.27, 0.06, 0.21, 0.06, 0.13, 0.02, 0.08, 0.06, 0.13, 0.04, 0.10, 0.08, 0.17, 0.04, 0.25, 0.06, 0.31, 0.04, 0.15, 0.02, 0.10, 0.08, 0.17, 0.06, 0.08, 0.04, 0.10, 0.06, 0.19, 0.06, 0.17, 0.06, 0...
## $ woodcockTotal     <int> 2, 15, 3, 9, 9, 10, 5, 13, 6, 9, 5, 11, 0, 10, 5, 11, 5, 7, 4, 12, 7, 11, 5, 10, 7, 13, 4, 11, 4, 13, 4, 11, 3, 8, 5, 8, 7, 10, 3, 10, 6, 8, 5, 11, 5, 11, 3, 11, 4, 9, 4, 11, 5, 12, 9, 12, 5, 11, 5, 10, 5, 11, 5, 13, 5, 13, 7, 10, 5, 10, 5, 9, 8, 12, 1, 8, 8, 8, 5, 13,...

let’s just make 2 simple aggregated version of this dataset, by subj & items

Review of dyplr

ps_data_bysubj_cond <- ps_data %>% #take your dataset
  group_by(subid, condition) %>% #retain subject and condition, collapse everything else, i.e. item 
  summarise(mean_corr = mean(correct, na.rm = TRUE),#create mean for each subj,cond
            sum_corr = sum(correct, na.rm = TRUE))# create sum for each subj,cond

ps_data_byitem_cond <- ps_data %>% #take your dataset
  group_by(item, condition) %>% #retain subject and item, collapse everything else, i.e. subj 
  summarise(mean_corr = mean(correct, na.rm = TRUE),#create mean for each item,cond
            sum_corr = sum(correct, na.rm = TRUE))# create sum for each item,cond

protip: commenting what every single line does is great practice when you’re stuck!

On to graphing!

Scatterplots!

Check out iris.

?iris

Let’s plot.

ggplot(data = iris)+
   geom_point(mapping = aes(x=Sepal.Length,
                            y =Species))

Now use the same approach on ps_data.

ggplot(data = ps_data_byitem_cond)+
  geom_point(mapping = aes(x=condition,
                           y =sum_corr))

ps_data_byitem_cond
## # A tibble: 8 x 4
## # Groups:   item [?]
##   item   condition mean_corr sum_corr
##   <chr>  <chr>         <dbl>    <int>
## 1 beds   Label         0.800       60
## 2 beds   No Label      0.208       15
## 3 faces  Label         0.653       49
## 4 faces  No Label      0.181       13
## 5 houses Label         0.547       41
## 6 houses No Label      0.167       12
## 7 pasta  Label         0.733       55
## 8 pasta  No Label      0.250       18

What’s wrong with this graph?

ggplot(data = ma_data)+
  geom_point(mapping = aes(x=grade,
                           y =arithmeticAverage))
## Warning: Removed 1 rows containing missing values (geom_point).

ma_data
## # A tibble: 328 x 12
##    subid    class_num grade       group  year ravens gonogo   swm   pvAvg
##    <chr>    <chr>     <chr>       <chr> <int>  <int>  <dbl> <dbl>   <dbl>
##  1 S1-02-03 S1_02     first grade CNTL   2015      3  0.815  1.83  0.    
##  2 S1-02-03 S1_02     first grade CNTL   2016     15  0.708  2.12  0.360 
##  3 S1-02-08 S1_02     first grade CNTL   2015      4  0.762  1.62  0.    
##  4 S1-02-08 S1_02     first grade CNTL   2016     12  0.708  1.38  0.360 
##  5 S1-02-17 S1_02     first grade CNTL   2015      5  0.431  1.50  0.0900
##  6 S1-02-17 S1_02     first grade CNTL   2016     10  0.731  4.38 NA     
##  7 S1-03-05 S1_03     first grade MA     2015      8  0.623  2.82  0.    
##  8 S1-03-05 S1_03     first grade MA     2016     18  0.562  5.21  0.910 
##  9 S1-03-14 S1_03     first grade MA     2015      8  0.762  3.08  0.    
## 10 S1-03-14 S1_03     first grade MA     2016      8  0.785  3.58  0.270 
##    arithmeticTotal arithmeticAverage woodcockTotal
##              <int>             <dbl>         <int>
##  1               0            0.                 2
##  2              12            0.250             15
##  3               0            0.                 3
##  4               4            0.0800             9
##  5               8            0.170              9
##  6              NA           NA                 10
##  7               4            0.0800             5
##  8              16            0.330             13
##  9               3            0.0600             6
## 10               8            0.170              9
## # ... with 318 more rows

jitter those points!

one thing you should always ask yourself is: how many points,bars, lines should i be seeing?

ggplot(data = ma_data)+
  geom_jitter(mapping = aes(x = grade,
                            y = arithmeticAverage))
## Warning: Removed 1 rows containing missing values (geom_point).

hm, that’s better, but now it feels a little TOO spread out, let’s reign it in

ggplot(data = ma_data)+
  geom_jitter(mapping = aes(x=grade,
                            y =arithmeticAverage),
              width = .2,
              height = 0)
## Warning: Removed 1 rows containing missing values (geom_point).

Exercise. Task 1. First scatterplots.

    1. using the ps_data_byitem_cond, make a scatterplot of mean correct (x axis) by condition (y axis)
    1. using the built-in iris dataset, make a scatterplot of Species by Petal.Width
    1. using ps_data_bysubj_condition, make a scatterplot of sum correct by condition where you can appropriately see the dots

aesthetics

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width,
                           color = Species))

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width,
                           alpha = Species))

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width,
                           shape = Species,
                           alpha = Species))

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width,
                           size = Species))
## Warning: Using size for a discrete variable is not advised.

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width),
             size = 4)

ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width))+
  facet_wrap(facets = ~Species)

Exercise. For Task 2, use ma_data and a scatterplot of your choosing (jittered if needed!).

    1. set the shape of all the dots in a scatterplot to an asterisk
    1. map a continuous variable onto color (hint: use ‘summary’ to see what’s continuous!)
    1. map a discrete variable (a factor or character) onto shape
    1. map a continuous variable (an integer or double) onto shape
    1. make a graph of your choosing using facet_wrap
    1. advanced: make a graph of your choosing using facet_wrap AND one with facet_grid:what’s the difference?

Moving forward: Other geoms

geom_line graph (we refer to this graph below in task 3c)

ggplot(data = ma_data, aes(x= factor(year),#this just makes it treat year as a factor
                           y= arithmeticAverage, 
                           group = subid))+# group keeps the 'unit' at subid
  geom_point()+
  geom_line()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).

geom_hline geom_text

ggplot(data = ps_data_byitem_cond, mapping = aes(x=condition,
                                                 y=mean_corr))+
  geom_point()+

  geom_hline(yintercept = .5)+ #hey, this adds a line!
  geom_text(label = "1b", x = .7, y= .2, color = "purple")# this but '1b' in the corner!

the x and y tell it where to put the text, here label is 1 on the x axis

Visualizing distributions

in 1d; we refer to this graph in task 3d

ggplot(data = ma_data, aes(x=gonogo))+
  geom_histogram(binwidth=.10)
## Warning: Removed 1 rows containing non-finite values (stat_bin).

in 2d

ggplot(data = ps_data_bysubj_cond, aes(x=condition, y = mean_corr))+
  geom_boxplot()

with density info:

ggplot(data = ps_data_bysubj_cond, aes(x=condition, y = mean_corr))+
  geom_violin()

with density AND dots!

ggplot(data = ps_data_bysubj_cond, aes(x=condition, y = mean_corr))+
  geom_violin()+
  geom_jitter(width=.1, height=.01, shape =1)# i like shape #1 for legibility

statistical transformation: smoothers

(and examples of ‘local’ vs. ‘global’ variable setting)

global x and y, color just for geom_point

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo)) + 
  geom_point(mapping = aes(color = grade)) + 
  stat_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

all vars global: what’s the difference?

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo, color = grade)) + 
  geom_point() + 
  stat_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

filter the data for a layer

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo, color = grade)) + 
  geom_point() + 
  stat_smooth(data = filter(ma_data,grade=="first grade"))# the smoother only gets grade 1 data!
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

take out a class, remove confidence bnd

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo, color = grade)) + 
  geom_point() + #the points include everyone
  stat_smooth(data = filter(ma_data,group != "MA"), 
              se = FALSE) # but the smoother doesn't see MA group
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

what does se = FALSE do?

stat_smooth default is loess (local estimator)

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo)) + 
  geom_point(aes(color = grade)) + 
  stat_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

but you can make it fit a line

ggplot(data = ma_data, mapping = aes(x = arithmeticTotal, y = gonogo)) + 
  geom_point(aes(color = grade)) + 
  stat_smooth( method="lm")
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

Exercise. Task 3. Geoms, distributions, and smoothers.

  • a): go back to one of the scatter plots from #1 and add a loess smooth, and a line
  • b): using ma_data, make a boxplot of swm for every value of woodcockTotal
  • c): go back to the geom_line graph above and separate the data by grade (multiple solutions!)
  • d): more advanced: come up with a solution so that the histogram only has each subject represented 1x

Adding error bars

mean by condition, no error bars yet

ggplot(data = ps_data, aes(x = condition, y = correct)) + 
  stat_summary(fun.y=mean, 
               na.rm=T, 
               geom = "bar")

barbarplots? [cf twitter]

95% confidence interval

ggplot(data = ps_data, aes(x = condition, y = correct)) + 
  stat_summary(fun.data = mean_cl_boot, geom = "pointrange") #fun.data, not fun.y!

mean_cl_boot is boostrapped confidence intervals, you can google what regular normal CIs would be!

both:

ggplot(data = ps_data, aes(x = condition, y = correct)) + 
  stat_summary(fun.y = mean, na.rm=T, geom = "bar")+
  stat_summary(fun.data = mean_cl_boot, geom = "pointrange")

errors bars with violins

same as violin plot above, but now with an errorbar!

ggplot(data = ps_data_bysubj_cond, aes(x=condition, y = mean_corr))+
  geom_violin()+
  stat_summary(fun.data=mean_cl_normal, geom = "pointrange")

stack and dodge

protip: use fill with bars not colour!

bonus question: what does colour do for bars?

ggplot(data = ma_data) + 
  geom_bar(mapping = aes(x = woodcockTotal, fill = grade), position = "fill")

ggplot(data = ma_data) + 
  geom_bar(mapping = aes(x = woodcockTotal, fill = grade), position = "stack")

ggplot(data = ma_data) + 
  geom_bar(mapping = aes(x = woodcockTotal, fill = grade), position = "dodge")

Exercise. Task 4: error bars, and stack & dodge

  • a): using the ps dataset, graph means for each item & add error bars
  • b): make a bargraph of the ps_data_byitem_cond showing the mean_corr for each condition using geom_bar (hint, you’ll need to use “stat=” insde your geom_bar() call
  • c): when would it be most appropriate to use fill, stack, or dodge?

Saving your graph

?ggsave()

ggsave will save your last plot by default, but you can also tell it save a plot you’ve assigned.

mygraph <- ggplot(data = iris)+
  geom_point(mapping = aes(x=Sepal.Length,
                           y =Petal.Width,
                           color = Species))

mygraph

ggsave("mygraph.pdf",plot = mygraph,dpi = 100)
## Saving 7 x 5 in image

even better than saving your graph: add it to your R Markdown! the awesome thing about using your .Rmd file is that you can render graphs there, and they get saved for you!

there are LOTS of settings you can muck with. (details here https://yihui.name/knitr/options/#plots). we’ll do this back in our .rmd file

Graph Wishes

Exercise. Task 5. Split into groups for task wishes.

  • Group A: Individual datapoints + summary stats.
  • Group B: Distribution-based Wishes.
  • Group C: Time-course graph based wishes.

hint for group a

ggplot(data = ma_data, aes(x= factor(year),#this just makes it treat year as a factor
                           y= arithmeticAverage, 
                           group = subid))+# this keeps the 'unit' at subid
  geom_point()+
  geom_line()+facet_wrap(~grade)+
  stat_summary(color = "red", size = 3, geom="line", fun.y=mean, aes(group =grade))
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).

hint 1 for group b

xvar <- c(rnorm(1500, mean = -1), rnorm(1500, mean = 1.5))
yvar <- c(rnorm(1500, mean = 1), rnorm(1500, mean = 1.5))
zvar <- as.factor(c(rep(1, 1500), rep(2, 1500)))
xy <- data.frame(xvar, yvar, zvar)
ggplot(xy, aes(xvar, yvar)) + geom_point() + geom_rug(col = "darkred", alpha = 0.1)

further hints: here and here

hint 2 for Group b

all the code for this graph appears to be here, BUT this person did not do things the tidy way! https://micahallen.org/2018/03/15/introducing-raincloud-plots/. exercise for the reader: do his wrangling the tidy way!

but using our ps_data_bysubj_cond and sourcing this:

source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

you should be able to make your raincloud:)

hint for group c: here’s a sample dataset and a graph to get you started in the right direction

library(feather)# this is part of tidyverse, but not auto-loaded

feather is a format that’s convenient for various reasons

coart<- read_feather("datasets/coart_test")
summary(coart)
##     timeBin        FixationID   SubjectNumber            gaze       
##  Min.   :  0.0   Min.   :   1   a01    : 10893   DISTRACTOR: 56765  
##  1st Qu.:103.0   1st Qu.:2232   a10    : 10757   TARGET    :114850  
##  Median :203.0   Median :4848   a14    : 10485   NA's      : 21748  
##  Mean   :203.9   Mean   :4757   a05    : 10446                      
##  3rd Qu.:303.0   3rd Qu.:7146   a17    : 10418                      
##  Max.   :653.0   Max.   :9160   a11    : 10380                      
##                                 (Other):129984                      
##      Trial             RT     TRIAL_START_TIME   audiotarget       
##  Min.   : 1.00   Min.   :-1   Min.   :  205001   Length:193363     
##  1st Qu.: 7.00   1st Qu.:-1   1st Qu.: 1500796   Class :character  
##  Median :13.00   Median :-1   Median : 6501502   Mode  :character  
##  Mean   :12.58   Mean   :-1   Mean   :15150737                     
##  3rd Qu.:19.00   3rd Qu.:-1   3rd Qu.:10383920                     
##  Max.   :24.00   Max.   :-1   Max.   :69109516                     
##  NA's   :20171                                                     
##    carrier          distractorimage    TargetImage       
##  Length:193363      Length:193363      Length:193363     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  distractorloc       targetloc             Pair          
##  Length:193363      Length:193363      Length:193363     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     first              second          targetwordonset      order     
##  Length:193363      Length:193363      Min.   :   0.0   Min.   :1.00  
##  Class :character   Class :character   1st Qu.: 690.0   1st Qu.:1.00  
##  Mode  :character   Mode  :character   Median : 828.0   Median :1.00  
##                                        Mean   : 820.6   Mean   :1.49  
##                                        3rd Qu.:1141.0   3rd Qu.:2.00  
##                                        Max.   :1276.0   Max.   :2.00  
##                                                                       
##   targetside        TrialType          Time           propt      
##  Length:193363      cross:57652   Min.   :    0   Min.   :0.000  
##  Class :character   NA   :20171   1st Qu.: 2060   1st Qu.:0.000  
##  Mode  :character   same :57508   Median : 4060   Median :1.000  
##                     unfam:58032   Mean   : 4077   Mean   :0.669  
##                                   3rd Qu.: 6060   3rd Qu.:1.000  
##                                   Max.   :13060   Max.   :1.000  
##                                                   NA's   :21748  
##     targetnum       TargetOnset       Nonset        prewin     longwin   
##  egg.png : 15128   Min.   :2500   Min.   :-3760.0   N:115674   N: 86944  
##  bed.png : 15051   1st Qu.:3190   1st Qu.:-1200.0   Y: 77689   Y:106419  
##  dog.png : 14589   Median :3328   Median :  780.0                        
##  foot.png: 14575   Mean   :3321   Mean   :  765.3                        
##  hand.png: 14271   3rd Qu.:3641   3rd Qu.: 2720.0                        
##  crib.png: 14114   Max.   :3776   Max.   :10560.0                        
##  (Other) :105635                                                         
##  whichwin_long    medwin      whichwin_med    shortwin   whichwin_short  
##  long   :106419   N:116193   med    :106419   N:152767   neither:  9255  
##  neither:  9255   Y: 77170   neither:  9255   Y: 40596   pre    : 77689  
##  pre    : 77689              pre    : 77689              short  :106419  
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##   adult_type       
##  Length:193363     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

do you know what each of these lines do? can you make errorbars?

ggplot(subset(coart, Nonset<5000 & Nonset>-1500),
       aes(Nonset, propt, color = TrialType))+
  geom_hline(yintercept=.5)+
  ylab("proportion of target looking")+
  xlab("time from target onset")+
  geom_vline(xintercept=0)+
  stat_smooth(geom="point")+
  theme_bw(base_size=18)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 17424 rows containing non-finite values (stat_smooth).

Extras for the curious

adding regression line

if all we wanted to do was add a regression line, we’d just use stat_smooth: note this is like the graph we did with the errorbars above, just edited a little

ggplot(ToothGrowth, aes(x=dose, y=len, colour=supp)) + 
  stat_summary(fun.y = mean, geom = "point",  size = 4) +
  geom_point( size = 1)+
  stat_smooth(method="lm")

but if we want to know what the actual formula for that line is, we have to calculate some things:

first we need a linear model

ourmodel <- lm(data = ToothGrowth, len~dose*supp)

if you want to know more about the results you do a summary of the model

summary(ourmodel)
## 
## Call:
## lm(formula = len ~ dose * supp, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2264 -2.8462  0.0504  2.2893  7.9386 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.550      1.581   7.304 1.09e-09 ***
## dose           7.811      1.195   6.534 2.03e-08 ***
## suppVC        -8.255      2.236  -3.691 0.000507 ***
## dose:suppVC    3.904      1.691   2.309 0.024631 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.083 on 56 degrees of freedom
## Multiple R-squared:  0.7296, Adjusted R-squared:  0.7151 
## F-statistic: 50.36 on 3 and 56 DF,  p-value: 6.521e-16

if you want the summary results to look prettier you tidy the model

tidy(ourmodel)
##          term  estimate std.error statistic      p.value
## 1 (Intercept) 11.550000  1.581394  7.303681 1.089558e-09
## 2        dose  7.811429  1.195422  6.534454 2.027753e-08
## 3      suppVC -8.255000  2.236429 -3.691152 5.073393e-04
## 4 dose:suppVC  3.904286  1.690582  2.309433 2.463136e-02

in our case, we can use the results of the model to manually put in a line, but there are fancier ways to do this that are beyond the scope of this tutorial

ggplot(ToothGrowth, aes(x=dose, y=len, colour=supp)) + 
  stat_summary(fun.y = mean, geom = "point",  size = 4) +
  geom_point( size = 1)+
  stat_smooth(method="lm")+
  annotate(x = 1, y = 30, "text", label = "y = 11.55 + 7.8 *dose + -8.26*suppVC + 3.9 * dose * suppVC")

note: for annotate, the x and y is where on the graph you want your text to go

if you want to check this formula, you can plug in some values:

#11.55 + 7.8 *dose + -8.26*suppVC + 3.9 * dose * suppVC
11.55 + 7.8*0 + -8.26*0 + 3.9*0*0 # dose of 0 for oj
## [1] 11.55
11.55 + 7.8*1 + -8.26*0 + 3.9*1*0# dose of 1 for oj
## [1] 19.35
11.55 + 7.8*2 + -8.26*0 + 3.9*2*0# dose of 2 for oj
## [1] 27.15
11.55 + 7.8*0 + -8.26*1 + 3.9*0*1# dose of 0 for vc
## [1] 3.29
11.55 + 7.8*1 + -8.26*1 + 3.9*1*1# dose of 1 for vc
## [1] 14.99
11.55 + 7.8*2 + -8.26*1 + 3.9*2*1# dose of 1 for vc
## [1] 26.69

manually specified errorbars

From: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/

tgc <- ToothGrowth%>%
  group_by(supp, dose)%>%
  summarise(n = n(),
            mean_len = mean(len),
            sd_len = sd(len),
            se_len = sd_len/sqrt(n),
            ci_len = qt((.95/2 +.5),
                        df= n-1)*se_len) # looking up 95%'s 2 tails in t-dist

ggplot(tgc, aes(x=dose, y=mean_len, colour=supp)) + 
  geom_errorbar(aes(ymin=mean_len - ci_len, ymax = mean_len + ci_len)) +
  geom_line() +
  geom_point()

ggplot(ToothGrowth, aes(x=dose, y=len, colour=supp)) + 
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar") +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "line")